import os
import sys
import pysam

# Copy a BAM file from the dataset's Mapping directory into the current
# directory, adding an XE tag ("spliced_<target>") to alignments whose XT tag
# names one of the targets below and whose genomic span is larger than the
# value of their XL tag. A per-target tally is printed at the end.
#
# Command-line arguments: the dataset name (selects the input directory and,
# for "MiSeq", paired-end processing) and the library name (BAM basename).
if len(sys.argv) < 3:
    sys.exit("Usage: %s DATASET LIBRARY" % os.path.basename(sys.argv[0]))
dataset = sys.argv[1]
library = sys.argv[2]

# NOTE(review): not referenced anywhere in the visible script; kept in case
# something else relies on it.
assembly = "hg38"

# Values of the XT tag that are eligible for a "spliced_<target>" annotation.
targets = ("histone", "TERC", "MALAT1", "snhg", "mRNA", "lncRNA", "gencode")
# Number of annotated alignments (or pairs, for MiSeq) per target.
counts = dict.fromkeys(targets, 0)

directory = "/osc-fs_home/mdehoon/Data/CASPARs/%s/Mapping" % dataset
filename = "%s.bam" % library
path = os.path.join(directory, filename)
# The output is written to the current directory under the same basename as
# the input; opening it for writing would truncate the input if both names
# resolve to the same file.
if os.path.realpath(path) == os.path.realpath(filename):
    sys.exit("Refusing to overwrite the input file %s" % path)


def _mapped_span(alignment):
    """Return (start, end) of a mapped alignment on the reference.

    Raises ValueError if the alignment has no usable reference interval
    (reference_end is None, or the interval is empty or inverted).
    """
    start = alignment.reference_start
    end = alignment.reference_end
    if end is None or start >= end:
        raise ValueError("Read %s has invalid reference span %r-%r"
                         % (alignment.query_name, start, end))
    return start, end


print("Reading %s" % path)
# The with-blocks guarantee that both files are closed (and the output BAM
# is finalised) even if a record fails one of the consistency checks.
with pysam.AlignmentFile(path) as alignments:
    print("Writing %s" % filename)
    with pysam.AlignmentFile(filename, "wb", template=alignments) as output:
        if dataset == "MiSeq":
            # Paired-end data: the two mates are taken from consecutive
            # records, so each loop iteration consumes two records.
            for alignment1 in alignments:
                alignment2 = next(alignments, None)
                if alignment2 is None:
                    raise ValueError("Odd number of records in %s: "
                                     "read %s has no mate"
                                     % (path, alignment1.query_name))
                if alignment1.is_unmapped:
                    if not alignment2.is_unmapped:
                        raise ValueError("Read %s is unmapped but its mate "
                                         "is mapped" % alignment1.query_name)
                else:
                    start1, end1 = _mapped_span(alignment1)
                    start2, end2 = _mapped_span(alignment2)
                    target = alignment1.get_tag("XT")
                    if target in targets:
                        # presumably the expected fragment/transcript length
                        # set by an upstream step -- TODO confirm
                        length = alignment1.get_tag("XL")
                        # The mates must be on opposite strands; the span
                        # runs from the start of the forward mate to the end
                        # of the reverse mate.
                        if alignment1.is_reverse:
                            if alignment2.is_reverse:
                                raise ValueError(
                                    "Both mates of read %s are on the "
                                    "reverse strand" % alignment1.query_name)
                            start, end = start2, end1
                        else:
                            if not alignment2.is_reverse:
                                raise ValueError(
                                    "Both mates of read %s are on the "
                                    "forward strand" % alignment1.query_name)
                            start, end = start1, end2
                        if end - start > length:
                            # Only the first mate carries the annotation.
                            alignment1.set_tag("XE", "spliced_%s" % target)
                            counts[target] += 1
                output.write(alignment1)
                output.write(alignment2)
        else:
            # Single-end data: each record is evaluated on its own.
            for alignment in alignments:
                if not alignment.is_unmapped:
                    start, end = _mapped_span(alignment)
                    target = alignment.get_tag("XT")
                    if target in targets:
                        length = alignment.get_tag("XL")
                        if end - start > length:
                            alignment.set_tag("XE", "spliced_%s" % target)
                            counts[target] += 1
                output.write(alignment)

print("Number of annotated mature transcripts:")
for target in targets:
    print("%s: %d" % (target, counts[target]))
